In [1]:
from utils import Activities, Users

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
pd.options.plotting.backend = "plotly"
import plotly.figure_factory as ff

import numpy as np

from config_vars import month_map, month_order
In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows    = None
In [3]:
data_folder = "../data/"

#plotly style:
line_traces = dict(mode='lines+markers',line_shape='spline',line_smoothing=1,marker_size=10,marker_opacity=0.9)
In [ ]:
 

Data Loading and Cleaning

In [4]:
users = Users(f"{data_folder}users.csv")
students = users.df.query("student == 1")
supervisors = users.df.query("supervisor == 1")
teachers  = users.df.query("teacher == 1")


activities = Activities(f"{data_folder}activities.csv")

activities.drop_over_year() # TOASK: a Nicola utenti oltre l'anno
activities.map_month(month_map,month_order) # map and sort months
activities.save_raw() # create a copy of non cleaned dataframe
activities.fillna() # fill some columns with 0 instead of NaN
activities.year_to_cat()


feedbacks = pd.read_csv(f"{data_folder}activities_feedbacks_info.csv")
feedbacks = feedbacks.loc[feedbacks['ac_activity'].isin(activities.df['ac_activity']),:]

df_months = pd.read_csv(f"{data_folder}months.csv").fillna(0)
df_months.replace({'month':month_map},inplace=True)
df_months['month_order'] = df_months['month'].map(dict(zip(month_order,range(1,13))))
df_months.sort_values(by=['month_order','activity_school_year'],inplace=True)

y_users = df_months['n_users_per_year']
df_months['norm_avg_n_user_recipes']        = df_months['n_recipes'].divide(y_users)
df_months['norm_avg_n_user_experiences']    = df_months['n_experiences'].divide(y_users)
df_months['norm_avg_n_activities']          = df_months['n_activities'].divide(y_users)

logins = pd.read_csv(f"{data_folder}log_logins.csv")

Data Exploring

Overview:

In [5]:
n_supervisors = len(supervisors)
n_students    = len(students)
n_teachers    = len(users.df.query("teacher == 1"))

print(f"There are {n_students} students, {n_supervisors} supervisors, and {n_teachers} teachers")
There are 546 students, 226 supervisors, and 24 teachers
In [6]:
activities.plot_dashboard()

Number of activities

In [7]:
hist_data = activities.df.groupby(by=['creation_month','activity_school_year','start_year'],sort=False).size().reset_index(name="count")
hist_data = activities.sort_month(hist_data,'creation_month',['activity_school_year','start_year'])

fig = hist_data.plot(x="creation_month", y=["count"], facet_col='activity_school_year', color="start_year")
fig.update_layout(
    title="Number of activities per start year",
    title_x=0.5,
    yaxis_title="# of activities",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()

Number of activities per student

In [8]:
fig = students.sort_values(by=["start_year"]).plot.box(
    x="start_year",y="n_activities", color="start_year",
    hover_data =["us_user","n_activities","start_year"]
)

fig.update_layout(
    title="Number of activities per student per start year",
    title_x=0.5,
    yaxis_title="Activities per student",
)

fig.update_yaxes(range=[0, 600])
fig.show()

Number of active users

In [9]:
hist_data = activities.df.groupby(by=['activity_school_year','start_year','us_user'],sort=False).size().reset_index(name="count")
active_users_per_year = hist_data.query("count > 0").drop("us_user",axis=1).groupby(by=['activity_school_year','start_year'],sort=False).size().reset_index(name="count").sort_values(by=["start_year","activity_school_year"])
In [10]:
fig = active_users_per_year.plot(x="start_year", y="count", facet_col='activity_school_year')
fig.update_layout(
    title="Number of active users per year",
    title_x=0.5,
    yaxis_title="# of actitve users",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45, type="category")
fig.update_traces(line_traces)

fig.show()

Number of feedbacks

In [11]:
hist_data = feedbacks.groupby(by=['activity_school_year','start_year'],sort=False).size().reset_index(name="count")
In [12]:
fig = hist_data.plot(x="start_year", y=["count"], facet_col='activity_school_year')
fig.update_layout(
    title="Number of feedbacks per start year",
    title_x=0.5,
    yaxis_title="# of feedbacks",
    showlegend=False,
)
fig.update_xaxes(title=dict(text=""),tickangle=45,type="category")
fig.update_traces(line_traces)

fig.show()
In [13]:
feedbacks['response_year'] = pd.DatetimeIndex(feedbacks['response_date']).year
data_hist = feedbacks.groupby(by=['recipient','response_year']).size().reset_index(name="count")

fig = data_hist.sort_values(by=["response_year"]).plot.box(
    x="response_year",y="count", color="response_year", 
    hover_data =["recipient","response_year","count"]
)

fig.update_layout(
    title="Number of feedbacks per supervisor",
    title_x=0.5,
    yaxis_title="Feedbacks per supervisor",
)

fig.show()

Number of activities

In [7]:
hist_data = activities.df.groupby(by=['creation_month','activity_school_year','start_year'],sort=False).size().reset_index(name="count")
hist_data = activities.sort_month(hist_data,'creation_month',['activity_school_year','start_year'])

fig = hist_data.plot(x="creation_month", y=["count"], facet_col='activity_school_year', color="start_year")
fig.update_layout(
    title="Number of activities per start year",
    title_x=0.5,
    yaxis_title="# of activities",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()

Number of activities per student

In [8]:
fig = students.sort_values(by=["start_year"]).plot.box(
    x="start_year",y="n_activities", color="start_year",
    hover_data =["us_user","n_activities","start_year"]
)

fig.update_layout(
    title="Number of activities per student per start year",
    title_x=0.5,
    yaxis_title="Activities per student",
)

fig.update_yaxes(range=[0, 600])
fig.show()

Number of active users

In [ ]:
 
In [22]:
hist_data = activities.df.groupby(by=['activity_school_year','start_year','us_user'],sort=False).size().reset_index(name="count")
active_users_per_year = hist_data.query("count > 0").drop("us_user",axis=1).groupby(by=['activity_school_year','start_year'],sort=False).size().reset_index(name="count").sort_values(by=["start_year","activity_school_year"])
activities_per_user = hist_data.drop("us_user",axis=1).groupby(by=['activity_school_year','start_year'],sort=False).mean().reset_index().sort_values(by=["start_year","activity_school_year"]).rename(columns={'count':'activities_per_user'})
df = active_users_per_year.merge(activities_per_user, on=["activity_school_year","start_year"],how="left")
In [52]:
fig = df.plot.scatter(x="start_year", y="count", facet_col='activity_school_year', color="activities_per_user")
fig.update_layout(
    title="Number of active users per year",
    title_x=0.5,
    yaxis_title="# of actitve users",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45, type="category")
fig.update_traces(line_traces)
fig.update_traces(line_traces,line=dict(color="gray"))

fig.show()

Number of feedbacks

In [31]:
hist_data = feedbacks.groupby(by=['activity_school_year','start_year'],sort=False).size().reset_index(name="count")
In [32]:
fig = hist_data.plot(x="start_year", y=["count"], facet_col='activity_school_year')
fig.update_layout(
    title="Number of feedbacks per start year",
    title_x=0.5,
    yaxis_title="# of feedbacks",
    showlegend=False,
)
fig.update_xaxes(title=dict(text=""),tickangle=45,type="category")
fig.update_traces(line_traces)

fig.show()
In [33]:
feedbacks['response_year'] = pd.DatetimeIndex(feedbacks['response_date']).year
data_hist = feedbacks.groupby(by=['recipient','response_year']).size().reset_index(name="count")

fig = data_hist.sort_values(by=["response_year"]).plot.box(
    x="response_year",y="count", color="response_year", 
    hover_data =["recipient","response_year","count"]
)

fig.update_layout(
    title="Number of feedbacks per supervisor",
    title_x=0.5,
    yaxis_title="Feedbacks per supervisor",
)

fig.show()

Number of feedback responses per supervisor

In [34]:
fig = supervisors.sort_values(by=["start_year"]).plot.box(
    x="start_year",y="n_feedback_responses", color="start_year",
    hover_data =["us_user","user_name","n_feedback_responses","start_year"]
)

fig.update_layout(
    title="Number of feedbacks per supervisor per start year",
    title_x=0.5,
    yaxis_title="Feedbacks per supervisor",
)

fig.show()

Number of feedback requests per student

In [35]:
fig = students.sort_values(by=["start_year"]).plot.box(
    x="start_year",y="n_feedback_requests", color="start_year",
    hover_data =["us_user","user_name","n_feedback_requests","start_year"]
)

fig.update_layout(
    title="Number of feedback requests per student",
    title_x=0.5,
    yaxis_title="Requests per student",
)

fig.show()

Activities images

In [36]:
fig = activities.df.sort_values(by=["start_year","activity_school_year"]).plot.box(
    x="activity_school_year",y="n_images", 
    color="start_year", 
    hover_data =["ac_activity","ac_title","activity_total_length","start_year"]
)

fig.update_layout(
    title="Number of images per activity",
    title_x=0.5,
)

fig.show()

Activities total lenght

In [37]:
fig = activities.df.sort_values(by=["start_year","activity_school_year"]).plot.box(
    x="activity_school_year",y="activity_total_length", 
    color="start_year", 
    hover_data =["ac_activity","ac_title","activity_total_length","start_year"]
)

fig.update_layout(
    title="Total lenghts",
    title_x=0.5,
)

fig.update_yaxes(range=[0, 600])
fig.show()

Months trends:

Activities per month

Creation

In [38]:
fig = df_months.plot(x="month", y=["n_activities"],facet_col='activity_school_year')
fig.update_layout(
    title="Total number of activities per month",
    title_x=0.5,
    yaxis_title="# activities",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()
In [39]:
fig = df_months.plot(x="month", y=["norm_avg_n_activities"],facet_col='activity_school_year')
fig.update_layout(
    title="Number of activities per users per month",
    title_x=0.5,
    yaxis_title="average # of activities",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()

Edits

In [40]:
fig = df_months.plot(x="month", y=["n_edits"],facet_col='activity_school_year')
fig.update_layout(
    title="Total number of activities per month",
    title_x=0.5,
    yaxis_title="# activities",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()

Curriculum

In [41]:
fig = activities.sort_month(
        activities.df[['last_edit_month','activity_school_year','in_curriculum']],
        'last_edit_month',['activity_school_year']
).hist(x="last_edit_month", y="in_curriculum", 
        facet_col='activity_school_year',
)

fig.update_layout(
    title="# of activities in curriculum per month",
    title_x=0.5,
    yaxis_title="# activities in curriculum",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)


fig.show()

Feedbacks

In [42]:
feedbacks['response_month'] = pd.DatetimeIndex(feedbacks['response_date']).month
feedbacks.replace({'response_month':month_map},inplace=True)
feedbacks['request_month'] = pd.DatetimeIndex(feedbacks['request_date']).month
feedbacks.replace({'request_month':month_map},inplace=True)
In [43]:
hist_requests = activities.sort_month(
    feedbacks.groupby(['activity_school_year','request_month']).size().reset_index(name="count"),
    'request_month',['activity_school_year']
).rename(columns={'request_month':'month'}).assign(type = 'request')

hist_responses = activities.sort_month(
    feedbacks.groupby(['activity_school_year','response_month']).size().reset_index(name="count"),
    'response_month',['activity_school_year']
).rename(columns={'response_month':'month'}).assign(type = 'response')

hist_data = hist_requests.append(hist_responses)
In [44]:
fig = hist_data.plot(
    x="month", y=["count"],facet_col='activity_school_year', color="type"
)

fig.update_layout(
    title="Total number of feedback requests and responses per month",
    title_x=0.5,
    yaxis_title="count",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()
In [45]:
feedbacks['has_response'] = feedbacks['response_date'].notnull()
In [46]:
a = feedbacks.groupby(["request_month","activity_school_year"]).sum()['has_response'].div(
        feedbacks.groupby(["request_month","activity_school_year"]).size()
    ).reset_index(name="ratio")

total = feedbacks.groupby(["request_month","activity_school_year"]).size()
In [47]:
fig = activities.sort_month(
    feedbacks.groupby(["request_month","activity_school_year"]).sum()['has_response'].div(
        feedbacks.groupby(["request_month","activity_school_year"]).size()
    ).reset_index(name="ratio").assign(
        total = feedbacks.groupby(["request_month","activity_school_year"]).size().values.astype('int64')
    ),'request_month',['activity_school_year']
).plot.scatter(
    x="request_month", y=["ratio"],facet_col='activity_school_year', 
    size="total", 
    color="total",
    size_max=60
)

fig.update_layout(
    title="Ratio responses/requests",
    title_x=0.5,
    yaxis_title="ratio",
    showlegend = False
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces,line=dict(color="gray"))

fig.show()

Feedbacks

Ratio

In [85]:
tot_req = feedbacks.groupby(['recipient']).size().reset_index(name="received").rename(columns={'recipient':'us_user'})
ratio = tot_req.merge(feedbacks.query("has_response == True").groupby(['recipient']).size().reset_index(name="responsed").rename(columns={'recipient':'us_user'}),on="us_user",how="left").fillna(0)
ratio['ratio'] = ratio['responsed'].div(ratio['received'])
ratio = ratio.merge(supervisors[['us_user','start_year','user_name']],on="us_user",how="right").dropna()
ratio.sort_values(by="ratio",inplace=True)
In [86]:
fig = ratio.plot.scatter(x="us_user",y="ratio",color="received")
fig.update_layout(
    title="Ratio responses/requests per each supervisor", 
    title_x=0.5,
    xaxis_title='supervisor',
    xaxis_type='category'
)
fig.update_traces(mode='markers',opacity=0.8,)

fig.add_trace(ratio.plot.bar(x="us_user",y="ratio",color="received").data[0])

fig.update_xaxes(showticklabels=False)
fig.show()
In [87]:
ratio['norm'] = ratio['responsed']*ratio['ratio']**2
bests=ratio.sort_values(by=["norm"],ascending=False).head(10)
In [88]:
bests
Out[88]:
us_user received responsed ratio start_year user_name norm
43 336 274.0 221.0 0.806569 2014 Agustoni Loris 143.772457
82 663 276.0 199.0 0.721014 2016 Carlo Giovio 103.452518
118 500485 107.0 103.0 0.962617 2018 Mirjam Trinkler 95.443008
154 500994 74.0 73.0 0.986486 2019 René Studer 71.040358
107 500352 67.0 63.0 0.940299 2018 Alberto Fissore 55.702161
125 500720 80.0 68.0 0.850000 2019 Gabriel Leleu 49.130000
114 500424 75.0 63.0 0.840000 2018 Roberto Danesi 44.452800
93 500093 38.0 38.0 1.000000 2018 Tosi Loris 38.000000
126 500722 43.0 37.0 0.860465 2019 Antonio Bramante 27.394808
61 424 41.0 35.0 0.853659 2015 Roberto Danesi 25.505651
In [110]:
resps = feedbacks.query("has_response == True").groupby(by=['recipient','response_year']).size().reset_index(name="responsed")
In [117]:
fig = resps.plot(x="response_year",y="responsed",color="recipient")
fig.update_layout(
    title="Responses per supervisor per year", 
    title_x=0.5,
    xaxis_title='response year',
    yaxis_title='# feedbacks',
    xaxis_type='category'
)
fig.update_traces(line_traces)

Edits and delay

In [89]:
df_feedbacks_info = feedbacks
In [90]:
no_answer = df_feedbacks_info['student_grade'].isnull().sum()
print(f'There are {no_answer}/{len(df_feedbacks_info)} without student grade ({round(no_answer/len(df_feedbacks_info),4)*100}%)')
There are 361/4739 without student grade (7.62%)
In [91]:
no_answer = df_feedbacks_info['response_date'].isnull().sum()
print(f'There are {no_answer}/{len(df_feedbacks_info)} without answer ({round(no_answer/len(df_feedbacks_info),3)*100}%)')
There are 2774/4739 without answer (58.5%)
In [92]:
no_answer = df_feedbacks_info['supervisor_grade'].isnull().sum()
print(f'There are {no_answer}/{len(df_feedbacks_info)} without supervisor grade ({round(no_answer/len(df_feedbacks_info),4)*100}%)')
There are 2841/4739 without supervisor grade (59.95%)
In [93]:
response_no_grade = (df_feedbacks_info['response_date'].notnull() & df_feedbacks_info['supervisor_grade'].isnull()).sum()
grade_no_response = (df_feedbacks_info['response_date'].isnull() & df_feedbacks_info['supervisor_grade'].notnull()).sum()

print(f'There are {response_no_grade} responses without grades')
There are 151 responses without grades
In [94]:
grade_no_response #TODO: VERIFICARE QUESTO!
Out[94]:
84
In [95]:
df_feedbacks_info.dropna(inplace=True)
df_feedbacks_info['delay_days'] = df_feedbacks_info['delay_hours']/24
In [96]:
df_hist = df_feedbacks_info.groupby(["activity_school_year","times_before_answer"]).count()[['ac_activity']].reset_index().rename(columns={'ac_activity': 'count'})


fig = df_hist.plot.hist(x="times_before_answer", y='count', 
                         color = "activity_school_year", histnorm='percent' )

fig.update_layout(
    barmode='group',
    xaxis_type='category',
    title="Requests before a feedback",
    title_x=0.5,
    yaxis_title="Percent",
    xaxis_title="Number of requests before the response",
    coloraxis_showscale=False,
)

fig.show()
In [97]:
fig = df_feedbacks_info[df_feedbacks_info['delay_days']<100].plot.hist(x="delay_days",  nbins=30,
                         color = "activity_school_year" )

fig.update_layout(
    barmode='group',
    title="Histogram: days before an answer",
    title_x=0.5,
    yaxis_title="Count",
    xaxis_title="days before an answer",
    coloraxis_showscale=False,
)

fig.show()
In [98]:
fig = df_feedbacks_info[df_feedbacks_info['delay_days']<100].plot.hist(x="delay_days",  nbins=30,
                         color = "activity_school_year", histnorm='percent' )

fig.update_layout(
    barmode='group',
    title="Histogram: normalized number of days before an answer",
    title_x=0.5,
    yaxis_title="Percent",
    xaxis_title="days before an answer",
    coloraxis_showscale=False,
)

fig.show()
In [99]:
fig = df_feedbacks_info[df_feedbacks_info['edits_after']<5].plot.hist(x="edits_after",  nbins=5,
                         color = "activity_school_year", histnorm='percent' )

fig.update_layout(
    barmode='group',
    title="Histogram: normalized number of edits after a feedback",
    title_x=0.5,
    yaxis_title="Percent",
    xaxis_title="# of edits after an answer per feedback request",
)

fig.show()
In [100]:
df_feedbacks_info['has_edit_after'] = (df_feedbacks_info['edits_after']>0).astype(int)
In [101]:
df_count = df_feedbacks_info.groupby(["activity_school_year","supervisor_grade","has_edit_after"]).size().reset_index(name='count')

c = (pd.core.reshape.util.cartesian_product([df_count['activity_school_year'].unique(),df_count['supervisor_grade'].unique(),df_count['has_edit_after'].unique()]))
c = pd.DataFrame(dict(activity_school_year=c[0],supervisor_grade=c[1],has_edit_after=c[2]))
c = c.set_index(["activity_school_year","supervisor_grade","has_edit_after"]).join(df_count.set_index(["activity_school_year","supervisor_grade","has_edit_after"])).reset_index().sort_values(by=['activity_school_year','supervisor_grade'])
c.fillna(0,inplace=True)

df_feedback_edits =  c[c['has_edit_after']==True].drop("has_edit_after",axis=1).rename(columns={'count':'edit'})
df_feedback_edits['no_edit'] = c[c['has_edit_after']==False]['count'].tolist()

df_feedback_edits['ratio'] = df_feedback_edits['edit'].div(df_feedback_edits['no_edit']+df_feedback_edits['edit']).fillna(0)
In [102]:
fig = df_feedback_edits.plot(x="supervisor_grade", y=["edit"],
                             line_group='activity_school_year', color='activity_school_year')
fig.update_layout(
    title="# activities that have been edit after a feedback per grade",
    title_x=0.5,
    yaxis_title="# activities",
    xaxis_title="supervisor grade",
)
fig.update_traces(line_traces)

fig.show()

Logins

In [103]:
dayofweek_map = {1:'Sun',2:'Mon',3:'Tue',4:'Wed',5:'Thu',6:'Fri',7:'Sat'}

logins_students = logins.merge(students,on="us_user",how="inner")
logins_supervisors = logins.merge(supervisors,on="us_user",how="inner")
logins_teachers = logins.merge(teachers,on="us_user",how="inner")

logins['date'] = logins['date'].astype('datetime64')
In [104]:
fig = logins_students.groupby(['dayofweek','hour']).size().reset_index(name="count").replace(
    {'dayofweek':dayofweek_map}
).plot(
    x="hour", y=["count"],facet_col='dayofweek', color="dayofweek"
)

fig.update_layout(
    title="Apprentices logins",
    title_x=0.5,
    yaxis_title="# logins",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)

fig.update_xaxes(title=dict(text=""))
fig.update_traces(line_traces)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))

fig.show()
In [105]:
fig = logins_supervisors.groupby(['dayofweek','hour']).size().reset_index(name="count").replace(
    {'dayofweek':dayofweek_map}
).plot(
    x="hour", y=["count"],facet_col='dayofweek', color="dayofweek"
)

fig.update_layout(
    title="Supervisors logins",
    title_x=0.5,
    yaxis_title="# logins",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)

fig.update_xaxes(title=dict(text=""))
fig.update_traces(line_traces)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))

fig.show()
In [106]:
fig = logins_teachers.groupby(['dayofweek','hour']).size().reset_index(name="count").replace(
    {'dayofweek':dayofweek_map}
).plot(
    x="hour", y=["count"],facet_col='dayofweek', color="dayofweek"
)

fig.update_layout(
    title="Teachers logins",
    title_x=0.5,
    yaxis_title="# logins",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)

fig.update_xaxes(title=dict(text=""))
fig.update_traces(line_traces)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))

fig.show()